FictionEro - Data Cleaning

Data Preparation

Code
library(tidyverse)
library(easystats)
library(patchwork)
library(ggside)
df <- read.csv("../data/rawdata_participants.csv") |> 
  mutate(across(everything(), ~ifelse(.x == "", NA, .x))) |>
  mutate(Experimenter = case_when(
    Language=="English" & Experimenter == "reddit7" ~ "Reddit (other)",
    Language=="English" & Experimenter == "reddit8" ~ "Reddit (other)",
    Language=="English" & Experimenter == "reddit1" ~ "Reddit (other)",
    .default = Experimenter
  ))

dftask <- read.csv("../data/rawdata_task.csv") |> 
  full_join(
    df[c("Participant", "Sex", "SexualOrientation")],
    by = join_by(Participant)
    )

The initial sample consisted of 483 participants (Mean age = 34.3, SD = 12.4, range: [18, 80]; Sex: 26.1% females, 72.9% males, 1.0% other; Education: Bachelor, 38.51%; Doctorate, 6.42%; High School, 27.74%; Master, 25.05%; Other, 1.66%; Primary School, 0.62%; Country: 24.64% USA, 13.66% UK, 12.42% France, 10.14% Colombia, 39.13% other).

Compute Scores

# Create Sexual "relevance" variable (Relevant, irrelevant, non-erotic)
dftask <- dftask |> 
  mutate(Relevance = case_when(
    Type == "Non-erotic" ~ "Non-erotic",
    Sex == "Male" & SexualOrientation == "Heterosexual" & Category == "Female" ~ "Relevant",
    Sex == "Female" & SexualOrientation == "Heterosexual" & Category == "Male" ~ "Relevant",
    Sex == "Male" & SexualOrientation == "Homosexual" & Category == "Male" ~ "Relevant",
    Sex == "Female" & SexualOrientation == "Homosexual" & Category == "Female" ~ "Relevant",
    # TODO: what to do with "Other"? 
    SexualOrientation %in% c("Bisexual", "Other") & Category %in% c("Male", "Female") ~ "Relevant",
    .default = "Irrelevant"
  )) 

Recruitment History

Code
# Consecutive count of participants per day (as area)
df |>
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) |> 
  group_by(Date, Language, Experimenter) |> 
  summarize(N = n()) |> 
  ungroup() |>
  # https://bocoup.com/blog/padding-time-series-with-r
  complete(Date, Language, Experimenter, fill = list(N = 0)) |> 
  group_by(Language, Experimenter) |>
  mutate(N = cumsum(N)) |>
  ggplot(aes(x = Date, y = N)) +
  geom_area(aes(fill=Experimenter)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    title = "Recruitment History",
    x = "Date",
    y = "Total Number of Participants"
  ) +
  facet_wrap(~Language, nrow=4, scales = "free_y") +
  see::theme_modern() 

Code
# Table
summarize(df, N = n(), .by=c("Language", "Experimenter")) |> 
  arrange(desc(N)) |> 
  gt::gt() |> 
  gt::opt_stylize() |> 
  gt::opt_interactive(use_compact_mode = TRUE) |> 
  gt::tab_header("Number of participants per recruitment source")
Number of participants per recruitment source

Feedback

Evaluation

The majority of participants found the study to be a “fun” experience. Interstingly, reports of “fun” were significantly associated with finding at least some stimuli arousing. Conversely, reporting “no feelings” was associated with finding the experiment “boring”.

Code
df |> 
  select(starts_with("Feedback"), -Feedback_Comments) |>
  pivot_longer(everything(), names_to = "Question", values_to = "Answer") |>
  group_by(Question, Answer) |> 
  summarise(prop = n()/nrow(df), .groups = 'drop') |> 
  complete(Question, Answer, fill = list(prop = 0)) |> 
  filter(Answer == "True") |> 
  mutate(Question = str_remove(Question, "Feedback_"),
         Question = str_replace(Question, "AILessArousing", "AI = Less arousing"),
         Question = str_replace(Question, "AIMoreArousing", "AI = More arousing"),
         Question = str_replace(Question, "CouldNotDiscriminate", "Hard to discriminate"),
         Question = str_replace(Question, "LabelsIncorrect", "Labels were incorrect"),
         Question = str_replace(Question, "NoFeels", "Didn't feel anything"),
         Question = str_replace(Question, "CouldDiscriminate", "Easy to discriminate"),
         Question = str_replace(Question, "LabelsReversed", "Labels were reversed")) |>
  mutate(Question = fct_reorder(Question, desc(prop))) |> 
  ggplot(aes(x = Question, y = prop)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks(), labels=scales::percent) +
  labs(x="Feedback", y = "Participants", title = "Feedback") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1), angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Code
cor <- df |> 
  select(starts_with("Feedback"), -Feedback_Comments) |> 
  mutate_all(~ifelse(.=="True", 1, 0)) |> 
  correlation(method="tetrachoric", redundant = TRUE) |> 
  correlation::cor_sort() |> 
  correlation::cor_lower()
For i = 2 j = 1  A cell entry of 0 was replaced with correct =  0.5.  Check your data!
For i = 2 j = 1  A cell entry of 0 was replaced with correct =  0.5.  Check your data!
Code
cor |> 
  mutate(val = paste0(insight::format_value(rho), format_p(p, stars_only=TRUE))) |>
  mutate(Parameter2 = fct_rev(Parameter2)) |>
  mutate(Parameter1 = fct_relabel(Parameter1, \(x) str_remove_all(x, "Feedback_")),
         Parameter2 = fct_relabel(Parameter2, \(x) str_remove_all(x, "Feedback_"))) |>
  ggplot(aes(x=Parameter1, y=Parameter2)) +
  geom_tile(aes(fill = rho), color = "white") +
  geom_text(aes(label = val), size = 3) +
  labs(title = "Feedback Co-occurence Matrix") +
  scale_fill_gradient2(
    low = "#2196F3",
    mid = "white",
    high = "#F44336",
    breaks = c(-1, 0, 1),
    guide = guide_colourbar(ticks=FALSE),
    midpoint = 0,
    na.value = "grey85",
    limit = c(-1, 1))  + 
  theme_minimal() +
  theme(legend.title = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

Comments

Code
data.frame(Language = df$Language,
           Source = df$Experimenter,
           Comments = trimws(df$Feedback_Comments)) |> 
  filter(!Comments %in% c(NA, "No", "Nope", "nope", "None", "na", "n/a", "Non")) |> 
  arrange(Language, Source) |>
  gt::gt() |> 
  gt::opt_stylize() |> 
  gt::opt_interactive(use_compact_mode = TRUE) 

Exclusion

outliers <- c(
  # "S206"  # Collapsed RTs in both phases
  # "S399"  # Negative Arousal-Valence correlations
  "S432"  # Only 0s for arousal (creates statistical problems)
  )
potentials <- list()

Mobile

Code
df |>
  ggplot(aes(x=Mobile, fill=Mobile)) +
  geom_bar() +
  geom_hline(yintercept=0.5*nrow(df), linetype="dashed") +
  theme_modern()

We removed 152 participants that participated with a mobile device.

Code
df <- filter(df, Mobile == "False")
dftask <- filter(dftask, Participant %in% df$Participant)

Experiment Duration

The experiment’s median duration is 24.61 min (50% CI [18.31, 25.96]).

Code
df |>
  mutate(Participant = fct_reorder(Participant, Experiment_Duration),
         Category = ifelse(Experiment_Duration > 60, "extra", "ok"),
         Duration = ifelse(Experiment_Duration > 60, 60, Experiment_Duration),
         Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |>
  ggplot(aes(y = Participant, x = Duration)) +
  geom_point(aes(color = Group, shape = Category)) +
  geom_vline(xintercept = median(df$Experiment_Duration), color = "red", linetype = "dashed") +
  scale_shape_manual(values = c("extra" = 3, ok = 19)) +
  scale_color_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  guides(color = "none", shape = "none") +
  ggside::geom_xsidedensity(fill = "#4CAF50", color=NA) +
  ggside::scale_xsidey_continuous(expand = c(0, 0)) +
  labs(
    title = "Experiment Completion Time",
    x = "Duration (in minutes)",
    y = "Participant"
  )  +
  theme_bw() +
  ggside::theme_ggside_void() +
  theme(ggside.panel.scale = .3,
        panel.border = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

Code
potentials$expe_duration <- arrange(df, Experiment_Duration) |>
  select(Participant, Experiment_Duration) |>
  head(5) 

Task Duration

Code
plot_hist <- function(dat) {
  dens <- rbind(
    mutate(bayestestR::estimate_density(filter(dftask, RT1 < 40 & RT2 < 40)$RT1), 
           Phase="Emotional ratings",
           y = y / max(y)),
    mutate(bayestestR::estimate_density(filter(dftask, RT1 < 40 & RT2 < 40)$RT2), 
           Phase="Reality rating",
           y = y / max(y))
  )
  
  dat |> 
    filter(RT1 < 40 & RT2 < 40) |>  # Remove very long RTs
    # mutate(Participant = fct_reorder(Participant, RT1)) |> 
    pivot_longer(cols = c(RT1, RT2), names_to = "Phase", values_to = "RT") |>
    mutate(Phase = ifelse(Phase == "RT1", "Emotional ratings", "Reality rating")) |>
    ggplot(aes(x=RT)) +
    geom_area(data=dens, aes(x=x, y=y, fill=Phase), alpha=0.33, position="identity") +
    geom_density(aes(color=Phase, y=after_stat(scaled)), linewidth=1.5) + 
    scale_x_sqrt(breaks=c(0, 2, 5, 10, 20)) +
    theme_minimal() +
    theme(axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          axis.line.y = element_blank()) +
    labs(title = "Distribution of Response Time for each Participant", x="Response time per stimuli (s)") +
    facet_wrap(~Participant, ncol=5, scales="free_y") +
    coord_cartesian(xlim = c(0, 25))
}
Code
plot_hist(dftask[dftask$Participant %in% df$Participant[1:60], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[61:120], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[121:180], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[181:240], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[241:264], ])

BAIT Questionnaire Duration

Code
df |>
  mutate(Participant = fct_reorder(Participant, BAIT_Duration),
         Category = ifelse(BAIT_Duration > 5, "extra", "ok"),
         Duration = ifelse(BAIT_Duration > 5, 5, BAIT_Duration),
         Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |>
  ggplot(aes(y = Participant, x = Duration)) +
  geom_point(aes(color = Group, shape = Category)) +
  geom_vline(xintercept = median(df$BAIT_Duration), color = "red", linetype = "dashed") +
  scale_shape_manual(values = c("extra" = 3, ok = 19)) +
  scale_color_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  guides(color = "none", shape = "none") +
  ggside::geom_xsidedensity(fill = "#9C27B0", color=NA) +
  ggside::scale_xsidey_continuous(expand = c(0, 0)) +
  labs(
    title = "Questionnaire Completion Time",
    x = "Duration (in minutes)",
    y = "Participant"
  )  +
  theme_bw() +
  ggside::theme_ggside_void() +
  theme(ggside.panel.scale = .3,
        panel.border = element_blank(),
        axis.ticks.y = element_blank(),
          axis.text.y = element_blank()) 

Response to Erotic Stimuli

Code
dat <- dftask |> 
  filter(Relevance %in% c("Relevant", "Non-erotic")) |> 
  group_by(Participant, Type) |> 
  summarise(Arousal = mean(Arousal), 
            Valence = mean(Valence),
            Enticement = mean(Enticement),
            .groups = "drop") |>
  pivot_wider(names_from = Type, values_from = all_of(c("Arousal", "Valence", "Enticement"))) |>
  transmute(Participant = Participant,
            Arousal = Arousal_Erotic - `Arousal_Non-erotic`,
            Valence = Valence_Erotic - `Valence_Non-erotic`,
            Enticement = Enticement_Erotic - `Enticement_Non-erotic`) |>
  filter(!is.na(Arousal)) |> 
  mutate(Participant = fct_reorder(Participant, Arousal)) 

dat |> 
  pivot_longer(-Participant) |> 
  mutate(Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |> 
  ggplot(aes(x=value, y=Participant, fill=Group)) +
  geom_bar(aes(fill=value), stat = "identity") +
  scale_fill_gradient2(low = "#3F51B5", mid = "#FF9800", high = "#4CAF50", midpoint = 0) +
  # scale_fill_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  theme_bw() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  labs(title = "Difference between Erotic and Neutral", x="Erotic - Neutral") +
  facet_wrap(~name, ncol=3, scales="free_x")

Code
potentials$emo_diff <- arrange(dat, Arousal) |>
  head(5)

Response Coherence

Code
dat <- dftask |> 
  filter(!Participant %in% c("S432")) |> 
  summarize(cor_ArVal = cor(Arousal, Valence),
            cor_ArEnt = cor(Arousal, Enticement),
            .by="Participant") |>
  mutate(Participant = fct_reorder(Participant, cor_ArVal)) 

dat |>
  pivot_longer(-Participant) |> 
  mutate(Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |> 
  mutate(name = fct_relevel(name, "cor_ArVal", "cor_ArEnt"),
         name = fct_recode(name, "Arousal - Valence" = "cor_ArVal", "Arousal - Enticement" = "cor_ArEnt")) |>
  ggplot(aes(y = Participant, x = value, fill = Group)) +
  geom_bar(stat = "identity") +
  # scale_fill_gradient2(low = "#3F51B5", mid = "#FF9800", high = "#4CAF50", midpoint = 0) +
  scale_fill_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  theme_bw() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  labs(title = "Difference between Erotic and Neutral", x="Erotic - Neutral") +
  facet_wrap(~name, ncol=3, scales="free_x")

Code
potentials$emo_cor <- arrange(dat, cor_ArVal) |>
  head(5)
Code
c(as.character(potentials$expe_duration$Participant), 
  as.character(potentials$emo_diff$Participant), 
  as.character(potentials$emo_cor$Participant)) |> 
  table()

S143 S203 S260 S301 S389 S397 S407 S420 S426 S429 S433 S434 S450 S460 S470 
   1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 

Sexual Profile

Sample

Code
df |>
  ggplot(aes(x = Sex)) +
  geom_bar(aes(fill = SexualOrientation)) +
  scale_y_continuous(expand = c(0, 0), breaks = scales::pretty_breaks()) +
  scale_fill_metro_d() +
  labs(x = "Biological Sex", y = "Number of Participants", title = "Sex and Sexual Orientation", fill = "Sexual Orientation") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )

We removed 11 participants that were incompatible with further analysis.

df <- filter(df, Sex != "Other" & SexualOrientation != "Other")
dftask <- filter(dftask, Participant %in% df$Participant)

Task Behaviour

Code
show_distribution <- function(dftask, target="Arousal") {
  dftask |> 
    filter(SexualOrientation %in% c("Heterosexual", "Bisexual", "Homosexual")) |>
    bayestestR::estimate_density(select=target, 
                                 at=c("Relevance", "Category", "Sex", "SexualOrientation"), 
                                 method="KernSmooth") |>
    ggplot(aes(x = x, y = y)) +
    geom_line(aes(color = Relevance, linetype = Category), linewidth=1) +
    facet_grid(SexualOrientation~Sex, scales="free_y")  +
    scale_color_manual(values = c("Relevant" = "red", "Non-erotic" = "blue", "Irrelevant"="darkorange")) +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_continuous(expand = c(0, 0)) +
    theme_minimal()  +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          plot.title = element_text(face="bold")) +
    labs(title = target) 
}

(show_distribution(dftask, "Arousal") | show_distribution(dftask, "Valence")) /
  (show_distribution(dftask, "Enticement") | show_distribution(dftask, "Realness")) +
  patchwork::plot_layout(guides = "collect") +
  patchwork::plot_annotation(title = "Distribution of Appraisals depending on the Sexual Profile",
                             theme = theme(plot.title = element_text(hjust = 0.5, face="bold"))) 

We kept only heterosexual participants (79.69%).

df <- filter(df, SexualOrientation == "Heterosexual")
dftask <- filter(dftask, Participant %in% df$Participant)

Final Sample

Code
df <- filter(df, !Participant %in% outliers)
dftask <- filter(dftask, Participant %in% df$Participant)

The final sample includes 254 participants (Mean age = 35.5, SD = 13.5, range: [18, 80]; Sex: 26.8% females, 73.2% males, 0.0% other; Education: Bachelor, 35.43%; Doctorate, 7.09%; High School, 26.38%; Master, 28.74%; Other, 1.97%; Primary School, 0.39%; Country: 22.05% USA, 13.78% Colombia, 13.78% France, 12.99% UK, 37.40% other).

Code
p_country <- dplyr::select(df, region = Country) |>
  group_by(region) |>
  summarize(n = n()) |>
  right_join(map_data("world"), by = "region") |>
  ggplot(aes(long, lat, group = group)) +
  geom_polygon(aes(fill = n)) +
  scale_fill_gradientn(colors = c("#FFEB3B", "red", "purple")) +
  labs(fill = "N") +
  theme_void() +
  labs(title = "A Geographically Diverse Sample", subtitle = "Number of participants by country")  +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2))
  )
p_country

Code
ggwaffle::waffle_iron(df, ggwaffle::aes_d(group = Ethnicity), rows=10) |> 
  ggplot(aes(x, y, fill = group)) + 
  ggwaffle::geom_waffle() + 
  coord_equal() + 
  scale_fill_flat_d() + 
  ggwaffle::theme_waffle() +
  labs(title = "Self-reported Ethnicity", subtitle = "Each square represents a participant", fill="")  +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2)),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.

Code
p_age <- estimate_density(df$Age) |>
  normalize(select = y) |> 
  mutate(y = y * 86) |>  # To match the binwidth
  ggplot(aes(x = x)) +
  geom_histogram(data=df, aes(x = Age), fill = "#616161", bins=28) +
  # geom_line(aes(y = y), color = "orange", linewidth=2) +
  geom_vline(xintercept = mean(df$Age), color = "red", linewidth=1.5) +
  # geom_label(data = data.frame(x = mean(df$Age) * 1.15, y = 0.95 * 75), aes(y = y), color = "red", label = paste0("Mean = ", format_value(mean(df$Age)))) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(title = "Age", y = "Number of Participants", color = NULL, subtitle = "Distribution of participants' age") +
  theme_modern(axis.title.space = 10) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )
p_age

Code
p_edu <- df |>
  mutate(Education = fct_relevel(Education, "Other", "Primary School", "High School", "Bachelor", "Master", "Doctorate")) |> 
  ggplot(aes(x = Education)) +
  geom_bar(aes(fill = Education)) +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  scale_fill_viridis_d(guide = "none") +
  labs(title = "Education", y = "Number of Participants", subtitle = "Participants per achieved education level") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )
p_edu

Birth Control

Code
colors <- c(
  "NA" = "#2196F3", "None" = "#E91E63", "Condoms (for partner)" = "#9C27B0",
  "Combined pills" = "#FF9800", "Intrauterine Device (IUD)" = "#FF5722", 
  "Intrauterine System (IUS)" = "#795548", "Progestogen pills" = "#4CAF50",
  "Other" = "#FFC107", "Condoms (female)" = "#607D8B"
)
colors <- colors[names(colors) %in% c("NA", df$BirthControl)]

p_sex <- df |>
  mutate(BirthControl = ifelse(Sex == "Male", "NA", BirthControl),
         BirthControl = fct_relevel(BirthControl, names(colors))) |>
  ggplot(aes(x = Sex)) +
  geom_bar(aes(fill = BirthControl)) +
  scale_y_continuous(expand = c(0, 0), breaks = scales::pretty_breaks()) +
  scale_fill_manual(
    values = colors,
    breaks = names(colors)[2:length(colors)]
  ) +
  labs(x = "Biological Sex", y = "Number of Participants", title = "Sex and Birth Control Method", fill = "Birth Control") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )
p_sex

Sexual Profile

Code
p_sexprofile <- df |>
  select(Participant, Sex, SexualOrientation, SexualActivity, COPS_Duration_1, COPS_Frequency_2) |> 
  pivot_longer(-all_of(c("Participant", "Sex"))) |> 
  mutate(name = str_replace_all(name, "SexualOrientation", "Sexual Orientation"),
         name = str_replace_all(name, "SexualActivity", "Sexual Activity"),
         name = str_replace_all(name, "COPS_Duration_1", "Pornography Usage (Duration)"),
         name = str_replace_all(name, "COPS_Frequency_2", "Pornography Usage (Frequency)")) |> 
  ggplot(aes(x = value, fill=Sex)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  scale_fill_manual(values = c("Male"= "#64B5F6", "Female"= "#F06292")) +
  labs(title = "Sexual Profile of Participants") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1), angle = 45, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  facet_wrap(~name, scales = "free")
p_sexprofile

Code
p_language <- df |>
  ggplot(aes(x = Language_Level)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  labs(x = "Level", y = "Number of Participants", title = "Language Level") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1))
  )

p_expertise <- df |>
  ggplot(aes(x = AI_Knowledge)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  labs(x = "Level", y = "Number of Participants", title = "AI-Expertise") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1))
  )

p_language | p_expertise

Code
p_country /
  (p_age + p_edu)

Beliefs about Artificial Information Technology (BAIT)

This section pertains to the validation of the BAIT scale measuring beliefs and expectations about artificial creations.

Exploratory Factor Analysis

Code
bait <- df |> 
  select(starts_with("BAIT_"), -BAIT_Duration) |> 
  rename_with(function(x) gsub("BAIT_\\d_", "", x))


cor <- correlation::correlation(bait, redundant = TRUE) |> 
  correlation::cor_sort() |> 
  correlation::cor_lower()

clean_labels <- function(x) {
  x <- str_remove_all(x, "BAIT_") |> 
    str_replace_all("_", " - ")
}

cor |> 
  mutate(val = paste0(insight::format_value(r), format_p(p, stars_only=TRUE))) |>
  mutate(Parameter2 = fct_rev(Parameter2)) |>
  mutate(Parameter1 = fct_relabel(Parameter1, clean_labels),
         Parameter2 = fct_relabel(Parameter2, clean_labels)) |> 
  ggplot(aes(x=Parameter1, y=Parameter2)) +
  geom_tile(aes(fill = r), color = "white") +
  geom_text(aes(label = val), size = 3) +
  labs(title = "Correlation Matrix",
       subtitle = "Beliefs about Artificial Information Technology (BAIT)") +
  scale_fill_gradient2(
    low = "#2196F3",
    mid = "white",
    high = "#F44336",
    breaks = c(-1, 0, 1),
    guide = guide_colourbar(ticks=FALSE),
    midpoint = 0,
    na.value = "grey85",
    limit = c(-1, 1))  + 
  theme_minimal() +
  theme(legend.title = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

Code
n <- parameters::n_factors(bait, package = "nFactors")
plot(n)

Code
efa <- parameters::factor_analysis(bait, cor=cor(bait), n=2, rotation = "oblimin", sort=TRUE, scores="tenBerge", fm="ml")
plot(efa)

Code
display(efa)
Rotated loadings from Factor Analysis (oblimin-rotation)
Variable ML1 ML2 Complexity Uniqueness
TextRealistic 0.76 -9.01e-03 1.00 0.43
TextIssues -0.60 0.13 1.09 0.67
ImitatingReality 0.41 0.21 1.48 0.74
ImagesRealistic 0.37 0.30 1.93 0.71
ImagesIssues -0.30 -9.93e-03 1.00 0.91
VideosIssues -0.06 0.79 1.01 0.39
EnvironmentReal 0.31 0.39 1.90 0.68
VideosRealistic -0.13 -0.39 1.23 0.80

The 2 latent factors (oblimin rotation) accounted for 33.27% of the total variance of the original data (ML1 = 18.87%, ML2 = 14.40%).

Confirmatory Factor Analysis

Code
m1 <- lavaan::cfa(efa_to_cfa(efa, threshold="max"), data=bait)
m2 <- lavaan::cfa(
  "G =~ ImitatingReality + EnvironmentReal + VideosIssues + TextIssues + VideosRealistic + ImagesRealistic + ImagesIssues + TextRealistic", 
  data=bait)

# bayestestR::bayesfactor_models(m1, m2)
lavaan::anova(m1, m2)

Chi-Squared Difference Test

   Df      AIC    BIC   Chisq Chisq diff   RMSEA Df diff Pr(>Chisq)    
m1 19 -11.5963 48.538  97.135                                          
m2 20   2.6739 59.271 113.405      16.27 0.24519       1  5.492e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
display(parameters::parameters(m1))
# Loading
Link Coefficient SE 95% CI z p
ML1 =~ TextIssues 1.00 0.00 (1.00, 1.00) < .001
ML1 =~ TextRealistic -1.42 0.26 (-1.92, -0.91) -5.51 < .001
ML1 =~ ImitatingReality -1.21 0.24 (-1.67, -0.75) -5.13 < .001
ML1 =~ ImagesIssues 0.63 0.17 (0.30, 0.96) 3.77 < .001
ML1 =~ ImagesRealistic -0.76 0.15 (-1.05, -0.47) -5.21 < .001
ML2 =~ EnvironmentReal 1.00 0.00 (1.00, 1.00) < .001
ML2 =~ VideosRealistic -0.74 0.16 (-1.06, -0.43) -4.62 < .001
ML2 =~ VideosIssues 0.93 0.18 (0.57, 1.28) 5.15 < .001
# Correlation
Link Coefficient SE 95% CI z p
ML1 ~~ ML2 -0.01 3.21e-03 (-0.02, -7.43e-03) -4.28 < .001

Exploratory Graph Analysis (EGA) is a recently developed framework for psychometric assessment, that can be used to estimate the number of dimensions in questionnaire data using network estimation methods and community detection algorithms, and assess the stability of dimensions and items using bootstrapping.

Unique Variable Analysis (UVA)

Unique Variable Analysis (Christensen, Garrido, & Golino, 2023) uses the weighted topological overlap measure (Nowick et al., 2009) on an estimated network. Values greater than 0.25 are determined to have considerable local dependence (i.e., redundancy) that should be handled (variables with the highest maximum weighted topological overlap to all other variables (other than the one it is redundant with) should be removed).

Code
uva <- EGAnet::UVA(data = bait, cut.off = 0.3)
uva
Variable pairs with wTO > 0.30 (large-to-very large redundancy)

     node_i        node_j   wto
 TextIssues TextRealistic 0.322

----

Variable pairs with wTO > 0.25 (moderate-to-large redundancy)

----

Variable pairs with wTO > 0.20 (small-to-moderate redundancy)

          node_i       node_j   wto
 VideosRealistic VideosIssues 0.212
Code
uva$keep_remove
$keep
[1] "TextIssues"

$remove
[1] "TextRealistic"

Networks

Code
ega <- list()
for(model in c("glasso", "TMFG")) {
  for(algo in c("walktrap", "louvain")) {
    for(type in c("ega", "ega.fit")) {  # "hierega"
      if(type=="ega.fit" & algo=="louvain") next  # Too slow
      ega[[paste0(model, "_", algo, "_", type)]] <- EGAnet::bootEGA(
        data = bait,
        seed=123,
        model=model,
        algorithm=algo,
        EGA.type=type,
        type="resampling",
        plot.itemStability=FALSE,
        verbose=FALSE)
      }
   }
}

EGAnet::compare.EGA.plots(
  ega$glasso_walktrap_ega, ega$glasso_walktrap_ega.fit,
  ega$glasso_louvain_ega, ega$TMFG_louvain_ega,
  ega$TMFG_walktrap_ega, ega$TMFG_walktrap_ega.fit,
  labels=c("glasso_walktrap_ega", "glasso_walktrap_ega.fit",
           "glasso_louvain_ega", "TMFG_louvain_ega",
           "TMFG_walktrap_ega", "TMFG_walktrap_ega.fit"),
                          rows=3,
  plot.all = FALSE)$all

Structure Stability

Figures shows how often each variable is replicating in their empirical structure across bootstraps.

Code
patchwork::wrap_plots(lapply(ega, plot), nrow = 3)

Final Model

Code
ega <- ega$TMFG_walktrap_ega$EGA
plot(ega)

Code
ega_scores <-  EGAnet::net.scores(bait, ega)$scores$std.scores |> 
  as.data.frame() |> 
  setNames(c("EGA1", "EGA2")) 
# Merge with data
scores <- lavaan::predict(m1) |> 
  as.data.frame() |> 
  data_rename(c("ML1", "ML2"), c("BAIT_SEM1", "BAIT_SEM2")) |> 
  cbind(ega_scores) |> 
  mutate(Participant = df$Participant) |>
  mutate(BAIT = rowMeans(select(bait, -contains("Videos")), na.rm = TRUE))

df <- full_join(df, scores, by="Participant")

We computed two type of general scores for the BAIT scale, an empirical score based on the average of observed data (of the most loading items) and a model-based score as predicted by the structural model. The first one gives equal weight to all items (and keeps the same [0-1] range), while the second one is based on the factor loadings and the covariance structure.

Code
correlation::cor_test(scores, "BAIT_SEM2", "BAIT") |> 
  plot() +
  ggside::geom_xsidedensity(aes(x=BAIT_SEM2), color="grey", linewidth=1) +
  ggside::geom_ysidedensity(aes(y=BAIT), color="grey", linewidth=1) +
  ggside::scale_xsidey_continuous(expand = c(0, 0)) +
  ggside::scale_ysidex_continuous(expand = c(0, 0)) +
  ggside::theme_ggside_void() +
  theme(ggside.panel.scale = .1) 

While the two correlate substantially, they have different benefits. The empirical score has a more straightforward meaning and is more reproducible (as it is not based on a model fitted on a specific sample), the model-based score takes into account the relative importance of the contribution of each item to their factor.

Code
table <- correlation::correlation(scores) |> 
  summary()

format(table) |> 
  datawizard::data_rename("Parameter", "Variables") |> 
  gt::gt() |> 
  gt::cols_align(align="center") |> 
  gt::tab_options(column_labels.font.weight="bold")
Variables BAIT EGA2 EGA1 BAIT_SEM2
BAIT_SEM1 -0.61*** -0.02 -0.92*** -0.83***
BAIT_SEM2 0.65*** 0.12 0.78***
EGA1 0.75*** 0.11
EGA2 0.21**

Corrrelation with GAAIS

Code
table <- correlation::correlation(scores, 
                         select(df, starts_with("GAAIS")),
                         bayesian=TRUE) |> 
  summary()

format(table) |> 
  datawizard::data_rename("Parameter", "Variables") |> 
  gt::gt() |> 
  gt::cols_align(align="center") |> 
  gt::tab_options(column_labels.font.weight="bold")
Variables GAAIS_Positive_17 GAAIS_Negative_15 GAAIS_Negative_10 GAAIS_Positive_7 GAAIS_Positive_12 GAAIS_Negative_9
BAIT_SEM1 -0.23*** 0.03 0.09 -0.18** -0.22*** -0.16**
BAIT_SEM2 0.14** 0.10 7.44e-03 0.03 0.11 0.19**
EGA1 0.29*** -0.04 -0.08 0.22** 0.21*** 0.16**
EGA2 -0.02 0.10 0.12* -0.02 -0.02 0.09
BAIT 0.24*** 0.05 0.04 0.08 0.18** 0.20***

AI-Expertise

Code
df |> 
  ggplot(aes(x=as.factor(AI_Knowledge), y=BAIT)) +
  geom_boxplot()

Code
# m <- betareg::betareg(BAIT ~ AI_Knowledge, data=df)
m <- lm(BAIT ~ AI_Knowledge, data=df)
# m <- brms::brm(BAIT ~ mo(AI_Knowledge), data=df, algorithm = "meanfield")
# m <- brms::brm(BAIT ~ AI_Knowledge, data=dfsub, algorithm = "meanfield")
display(parameters::parameters(m))
Parameter Coefficient SE 95% CI t(252) p
(Intercept) 0.58 0.02 (0.53, 0.62) 26.20 < .001
AI Knowledge 6.20e-03 5.97e-03 (-5.56e-03, 0.02) 1.04 0.300
Code
marginaleffects::predictions(m, by=c("AI_Knowledge"), newdata = "marginalmeans") |> 
  as.data.frame() |> 
  ggplot(aes(x=AI_Knowledge, y=estimate)) +
  geom_jitter2(data=df, aes(y=BAIT), alpha=0.2, width=0.1) +
  geom_line(aes(group=1), position = position_dodge(width=0.2)) +
  geom_pointrange(aes(ymin = conf.low, ymax=conf.high), position = position_dodge(width=0.2)) +
  theme_minimal() +
  labs(x = "AI-Knowledge", y="BAIT Score")

Gender and Age

Code
# m <- betareg::betareg(BAIT ~ Sex / Age, data=df, na.action=na.omit)
m <- lm(BAIT ~ Sex / Age, data=df)
display(parameters::parameters(m))
Parameter Coefficient SE 95% CI t(250) p
(Intercept) 0.62 0.04 (0.55, 0.69) 17.62 < .001
Sex (Male) -0.01 0.04 (-0.10, 0.07) -0.32 0.751
Sex (Female) × Age 3.29e-04 1.15e-03 (-1.93e-03, 2.59e-03) 0.29 0.775
Sex (Male) × Age -6.39e-04 6.16e-04 (-1.85e-03, 5.75e-04) -1.04 0.301

Belief in the Instructions

Code
glm(Feedback_LabelsIncorrect ~ BAIT * AI_Knowledge, 
    data=mutate(df, Feedback_LabelsIncorrect = ifelse(Feedback_LabelsIncorrect=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Labels are Incorrect'")
Predicting ‘Labels are Incorrect’
Parameter Log-Odds SE 95% CI z p
(Intercept) 2.04 2.50 (-2.79, 7.09) 0.82 0.414
BAIT -5.86 4.25 (-14.52, 2.23) -1.38 0.168
AI Knowledge -0.30 0.66 (-1.61, 0.99) -0.45 0.652
BAIT × AI Knowledge 0.98 1.11 (-1.16, 3.21) 0.88 0.378
Code
glm(Feedback_LabelsReversed ~ BAIT * AI_Knowledge, 
    data=mutate(df, Feedback_LabelsReversed = ifelse(Feedback_LabelsReversed=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Labels are reversed'")
Predicting ‘Labels are reversed’
Parameter Log-Odds SE 95% CI z p
(Intercept) -5.09 3.57 (-12.35, 1.75) -1.43 0.154
BAIT 5.78 5.57 (-5.18, 16.94) 1.04 0.300
AI Knowledge 0.19 1.02 (-1.79, 2.21) 0.18 0.854
BAIT × AI Knowledge -0.67 1.59 (-3.85, 2.40) -0.42 0.671
Code
glm(Feedback_CouldDiscriminate ~ BAIT * AI_Knowledge, 
    data=mutate(df, Feedback_CouldDiscriminate = ifelse(Feedback_CouldDiscriminate=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Easy to discriminate'")
Predicting ‘Easy to discriminate’
Parameter Log-Odds SE 95% CI z p
(Intercept) -1.87 7.33 (-15.02, 12.02) -0.26 0.798
BAIT -3.47 13.23 (-29.82, 18.36) -0.26 0.793
AI Knowledge 0.42 1.97 (-3.27, 3.95) 0.21 0.833
BAIT × AI Knowledge -0.59 3.56 (-6.84, 6.06) -0.17 0.869
Code
glm(Feedback_CouldNotDiscriminate ~ BAIT * AI_Knowledge, 
    data=mutate(df, Feedback_CouldNotDiscriminate = ifelse(Feedback_CouldNotDiscriminate=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Hard to discriminate'")
Predicting ‘Hard to discriminate’
Parameter Log-Odds SE 95% CI z p
(Intercept) -1.02 2.28 (-5.57, 3.47) -0.45 0.656
BAIT 1.07 3.76 (-6.36, 8.55) 0.29 0.776
AI Knowledge -0.09 0.62 (-1.32, 1.14) -0.15 0.884
BAIT × AI Knowledge 0.07 1.02 (-1.95, 2.08) 0.07 0.947
Code
glm(Feedback_Fun ~ BAIT * AI_Knowledge, 
    data=mutate(df, Feedback_Fun = ifelse(Feedback_Fun=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Fun'")
Predicting ‘Fun’
Parameter Log-Odds SE 95% CI z p
(Intercept) 1.35 2.24 (-3.02, 5.86) 0.60 0.548
BAIT -2.77 3.72 (-10.30, 4.45) -0.74 0.457
AI Knowledge -0.22 0.61 (-1.44, 0.97) -0.36 0.722
BAIT × AI Knowledge 0.58 1.00 (-1.37, 2.61) 0.58 0.561
Code
glm(Feedback_Boring ~ BAIT * AI_Knowledge, 
    data=mutate(df, Feedback_Boring = ifelse(Feedback_Boring=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Boring'")
Predicting ‘Boring’
Parameter Log-Odds SE 95% CI z p
(Intercept) -2.48 3.31 (-8.98, 3.99) -0.75 0.453
BAIT -0.70 5.53 (-11.79, 9.85) -0.13 0.899
AI Knowledge 0.40 0.84 (-1.23, 2.06) 0.48 0.629
BAIT × AI Knowledge -0.04 1.39 (-2.74, 2.71) -0.03 0.978

Save

write.csv(df, "../data/data_participants.csv", row.names = FALSE)
write.csv(dftask, "../data/data.csv", row.names = FALSE)